library(tidyverse)
library(scales)
library(lubridate)
library(tidytext)
library(widyr)
library(ggraph)
library(topicmodels)
library(glue)
library(ldatuning)
library(cowplot)
library(tm)

library(dplyr)
library(stringr)
library(tidyr)
library(purrr)
library(psych)
library(tidytext)

# Estimation --------------------------------------------------------------

# Defining some parameters
pars <- list()
pars$savePlots <- 1
pars$prefix <- "graphs/ecb_ssm_"

# Loading the speeches data set
df_ecb <- readRDS("data/ecb.rds") %>% mutate(type = "ecb") ###before: read_rds
df_ecb2018 <- readRDS("data/ecb_2018.rds") %>% mutate(type = "ecb")
df_ssm <- readRDS("data/ssm_2018.rds") %>% mutate(type = "ssm")
df_ssm <- df_ssm %>% mutate(id = paste0("ssm_",1:nrow(df_ssm)))

# Since all the previous data cleaning was doner for the ecb.rds dataset only,
# we keep the old data set as it is and just look for new speeches in 2018.
# Then, we also need to make sure that the SSM speeches are not in the ECB speeches
# and, thus, counted twice.
exclude_ecb_ssm <- plyr::match_df(df_ecb, df_ssm, on = c("date","speaker","title")) 
exclude_ecb2018_ssm <- plyr::match_df(df_ecb2018, df_ssm, on = c("date","speaker","title"))
exclude_ecb <- plyr::match_df(df_ecb2018, df_ecb, on = c("date","speaker"))

# Add the speeches from the new dataset (excluding the SSM speeches)
df_add_ecb <- df_ecb2018 %>%
  filter(!id %in% exclude_ecb$id,
         !id %in% exclude_ecb2018_ssm$id,
         year(date) >= 2017)
df_add_ecb <- df_add_ecb %>%
  mutate(id = paste0("ecb_",3000:(3000+nrow(df_add_ecb)-1)))

# Exclude the SSM speeches from the old dataset
df_ecb <- df_ecb %>%
  filter(!id %in% exclude_ecb_ssm$id)

# Put all 3 datasets together
df <- rbind(df_ecb, df_add_ecb, df_ssm)

# Clean them by removing some speeches that errorneously got characterised as english
# on the website and other speeches that only contain slides or present the FSR.
df <- df %>% filter(
    !id %in% paste0("ecb_", c(23, 21, 30, 39, 119, 135, 146, 273,
                              274, 307, 370, 447, 796, 1882)),
    str_detect(url, "\\.en\\."),
    !str_detect(title, "[sS]lides|Q&A"),
    !str_detect(title, regex("parliament|ECOFIN|(press conference)", TRUE)), ###with regex ignore case...
    !str_detect(title, "Financial Stability Review"),
    date <= make_date(2018,4,30),
    date >= make_date(1998,5,1)
  )


# Count the number of speeches in each year and plot them
speeches_count <- df %>%
  group_by(year = floor_date(date, "year"), type) %>%
  count(year) %>%
  ungroup() %>%
  complete(year, type, fill = list(n = 0)) %>% 
  filter(year(year) <= 2018)

speeches_count$type <- factor(speeches_count$type,
                         levels = rev(unique(speeches_count$type)))

ggplot(speeches_count, aes(x = year, y = n)) +
  geom_area(aes(fill = type)) +
  theme_light() +
  labs(x = NULL, y = NULL,
     title = "Number of speeches per year")

# Count the number of speeches for each speaker
df %>%
  count(speaker, sort = TRUE)

# Define some words that will be removed from the text
# Speaker names:
speaker_names <- tolower(df$speaker) %>% unique() %>% str_split(" ") %>% flatten_chr()
# Manual stop words that nevertheless appear regularly
sw_man <- c("der", "die", "das", "und", "zu", "sehr", "geehrte", "damen", "herren",
            "auch", "ich", "fr", "ist", "den", "en", "auf", "les", "des", "de",
            "le", "la", "g", "", "dear", "mr", "president", "chart", "sheet", "slide")

# Prepare a data set with cleaned text, where all the numbers, stop words and punction are removed.
df_clean <- df
df_clean$text <- tolower(df_clean$text) %>%
# Potentially deal with the hyphen problem by uncommenting following line:
#  str_replace_all(c("co.ordination" = "coordination","co.operation" = "cooperation")) %>%
  removeNumbers() %>%
  removeWords(c(stop_words$word[stop_words$lexicon == "SMART"],speaker_names,sw_man)) %>%
  removePunctuation() %>%
  stripWhitespace()

# Count the frequency of unique words in each document
df_count <- df_clean %>%
  unnest_tokens(word, text, "words") %>%
  count(id, word, sort = TRUE) %>%
  filter(str_length(word) <= 25)

# Calculate the TF-IDF (term frequency, inverse document frequency)
df_count_idf <- df_count %>% 
  bind_tf_idf(word, id, n)

# Look at the mean and median of the TF-IDF
summary(df_count_idf$tf_idf)

# Set a threshold of "uninformative" words of 0.004 and remove all words below that threshold.
pars$tf_ifd_threshold <- 0.004
pars$tf_ifd_threshold_txt <- substr(toString(pars$tf_ifd_threshold),3,10) 
df_count_idf_filtered <- df_count_idf%>%
  filter(tf_idf > pars$tf_ifd_threshold) 

# Prepare a Document-Term-Matrix for the LDA algorithm.
dtm <- cast_dtm(df_count_idf_filtered, id, word, n)
dtm


# Following code runs the LDA algorithm for different number of topics and
# calculates the CaoJuan2009 metric. Eventually, the model with the lowest
# metric indicates the optimal number of topics.
# This code doesn't have to be run every time, since the execution time is
# very long (can take up to 15 hours). It is just to find the initial number
# of optimal topics in the very beginning.

# result <- FindTopicsNumber(
#   dtm,
#   topics = 5:50,
#   metrics = "CaoJuan2009",
#   method = "Gibbs",
#   control = list(seed = 77),
#   mc.cores = 4L,
#   verbose = TRUE
# )
# 
# ggplot(result, aes(x = topics, y = CaoJuan2009)) +
#   geom_line() +
#   geom_point() +
#   theme_light() +
#   scale_x_continuous(breaks = pretty_breaks(15)) +
#   labs(x = "Number of topics",
#        y = "Juan Cao (2009)")

# Above analysis suggests that 50 number of topics seems optimal.
pars$nrTopics <- 50

# Finally running the LDA algorithm with k=50 number of topics.
lda <- LDA(dtm, k = pars$nrTopics, control = list(seed = 234))

save.image(file = "prepared_data_lda.RData")



# Analysis ----------------------------------------------------------------

# This code can be run independently and only requires that the estimation part
# has been run at least one time before to produce the prepared_data_lda data set.

load("prepared_data_lda.RData")

source("plot_funcs_ph.R")

pars$savePlots <- 1
pars$prefix <- "graphs/"

# terms(lda) returns the word with the highest probability in each topic.
# This word can be used to automatically name the topics as a first approach.
trms <- terms(lda)
df_topics <- tibble(topic = 1:length(trms), topic_desc = paste0("Topic ", topic, ": ", trms))


# This defines a tibble with manually defined topic names.
df_topics_man <- tribble(~topic, ~topic_desc,
1, "Data and statistics",
2, "EU enlargement",
3, "Banking union",
4, "Asset bubbles/prices",
5, "Central bank communication, uncertainty and forward guidance",
6, "Real exchange rate",
7, "Payment and settlement systems",
8, "Monetary policy strategy and monetary analysis",
9, "Global flows and emerging economies",
10, "Accounting, information, governance and supervision",
11, "International trade",
12, "Policy institutions and Eurosystem",
13, "Greek program and crisis",
14, "Money (inter alia, electronic money, retail payments, unit of account, currency competition)",
15, "Stress tests and comprehensive assessment",
16, "EMU and European unity/culture",
17, "Financial structure and SME financing conditions",
18, "Market operations and liquidity",
19, "Labour productivity, competitiveness and growth",
20, "EMU completion",
21, "Banks and NPLs",
22, "Governing Council decisions, independence and accountability",
23, "Eurosystem setup, fintech and other",
24, "Negative rates and lower bound",
25, "Competition and regulation",
26, "Asset purchases",
27, "Household income and wealth",
28, "SEPA and retail payments",
29, "Crisis",
30, "Inflation and Phillips curve",
31, "Euro cash changeover",
32, "Capital requirements",
#33, "Inflation and Phillips curve",
34, "Growth (mainly European, but also Chinese, Asian and global, as well as role of monetary policy in growth)",
35, "Credit risk and management",
36, "Monetary policy, cycles and asset prices", 
37, "Convergence and accession",
38, "Structural reforms",
39, "Post-crisis recovery and sustained inflation",
40, "Single Supervisory Mechanism",
41, "Growth, productivity and investment",
42, "CCPs and derivatives clearing",
43, "Financial market risks and developments",
44, "Global regulatory reform",
45, "Fiscal policies, rules and imbalances",
46, "Post-trading arrangements, securities settlement and T2S",
47, "International role of the euro",
48, "Macroprudential policy and systemic risk",
49, "EMU, risk sharing, CMU and BU",
50, "Financial integration"
)


# Beta contains the word distribution within topics.
beta <- tidy(lda)

# Gamman contains the topic distribution within documents.
gamma <- tidy(lda, matrix = "gamma") %>% 
  left_join(df_topics)


#Join Inflation and Philips curve topics (30 & 33) for expositional purposes
gamma$topic[gamma$topic == 33] <- 30
gamma$topic_desc[gamma$topic_desc == "Topic 33: inflation"] <- "Topic 30: inflation"
df_topics<-df_topics[-c(33),]
trms<-trms[-c(33)]
df_topics_man$topic<- 1:49
df_topics$topic<- 1:length(df_topics_man$topic)
df_topics$topic_desc<-paste("Topic ", df_topics$topic, ": ", trms)
pars$nrTopics <- 49

for (i in 34:50){
  gamma$topic[gamma$topic == i] <- i-1  
  gamma$topic_desc[gamma$topic_desc == paste("Topic ", i, ": ", trms[[i-1]],sep="")] <- paste("Topic ", i-1, ": ", trms[[i-1]],sep="")
}


# Grouping the topics manually --------------------------------------------

width <- 3000
height <- 1800
res <- 200
pars$savePlots <- 1

# Add a column with the group for each topic
gammas_g <- gamma %>% mutate(group = case_when(
  topic %in% c(5,8,18,22,24,26,30,35,38,17) ~ 1,
  topic %in% c(3,4,10,15,21,25,32,34,39,41,42,43,47) ~ 2, 
  topic %in% c(6,19,33,37,40) ~ 3,
  topic %in% c(13,29,44) ~ 4,
  topic %in% c(7,14,28,46) ~ 5,
  topic %in% c(31) ~ 6,
  topic %in% c(2,16,20,23,36,48,49) ~ 7,
  topic %in% c(9,11,46) ~ 8,
  TRUE ~ 9   
))

# Define some group headings
df_groups <- tribble( ~groupNr,   ~groupName,
                      1, "Monetary policy and inflation",
                      2, "Financial instability, regulation and banking union",
                      3, "Growth, productivity and structural reforms",
                      4, "Fiscal policy, public debt and sovereign crisis",
                      5, "Payment and settlement systems",
                      6, "Banknotes, coins and cash changeover",
                      7, "EMU setup and reforms, financial integration and enlargement",
                      8, "International issues",
                      9, "Other"
)
# Save them into a factor variable for the legend
gammas_g$group_f <- factor(gammas_g$group,
                           levels = rev(df_groups$groupNr),
                           labels = rev(df_groups$groupName))

## Plot shares of all groups
plotDistTimeStack_allgroups(gammas_g, df, df_topics_man, pars, normalization = 0,
                            plot_title = " ",
                            plot_subtitle = " ",
                            abbr = "groups")


## Plot shares of selected groups

# normalization = 0: plots the topics in absolute number of speeches
# normalization = 1: plots the topics in share of speeches (where 100% are all speeches within a specific year)
# normalization = 2: plots the topics in share of the selected group (where 100% is the share of the group)

plotDistTimeStack_group(gammas_g, df, df_topics_man, pars, group_sel=1, normalization = 0,
                        plot_title = " ",
                        plot_subtitle = " ",
                        abbr = "MP")

